1 Rents in San Francisco, 2000-2018

# download directly off the TidyTuesday GitHub repo

# Read the TidyTuesday San Francisco rent dataset (~200k Craigslist rental
# listings, 2000-2018) straight from the raw GitHub URL.
rent <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-07-05/rent.csv')

(1) What are the variable types? (2) Do they all correspond to what they really are? (3) Which variables have most missing values?

(1) The following variables are characters:

  • post_id
  • nhood
  • city
  • country
  • address
  • title
  • descr
  • details

The following variables are numeric, more specifically, double-precision floating-point:

  • date
  • year
  • price
  • beds
  • baths
  • sqft
  • room_in_apt
  • lat
  • lon
  (2) Yes, all variables’ data types correspond to what they really are. The variable “date” is saved as a double; the double doesn’t imply that it is a date, but in R dates are stored as doubles.

(3) The variable “descr” has the most missing values (~198k), followed by “address” (~197k) and “lon” (~196k)

# (1)
glimpse(rent)
## Rows: 200,796
## Columns: 17
## $ post_id     <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date        <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year        <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2009, 2006…
## $ nhood       <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city        <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county      <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price       <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 450, 1395, 1…
## $ beds        <dbl> 2, 2, 2, 1, 1, 1, 1, 3, NA, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, …
## $ baths       <dbl> 2, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, 3, NA, NA, NA…
## $ sqft        <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, NA, 2581, 1756, N…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.5, NA, …
## $ lon         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title       <chr> "$1250 / 2br - 2BR/2BA   1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=…
# (2)
glimpse(rent)
## Rows: 200,796
## Columns: 17
## $ post_id     <chr> "pre2013_134138", "pre2013_135669", "pre2013_127127", "pre…
## $ date        <dbl> 20050111, 20050126, 20041017, 20120601, 20041021, 20060411…
## $ year        <dbl> 2005, 2005, 2004, 2012, 2004, 2006, 2007, 2017, 2009, 2006…
## $ nhood       <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ city        <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ county      <chr> "alameda", "alameda", "alameda", "alameda", "alameda", "al…
## $ price       <dbl> 1250, 1295, 1100, 1425, 890, 825, 1500, 2925, 450, 1395, 1…
## $ beds        <dbl> 2, 2, 2, 1, 1, 1, 1, 3, NA, 2, 2, 5, 4, 0, 4, 1, 3, 3, 1, …
## $ baths       <dbl> 2, NA, NA, NA, NA, NA, 1, NA, 1, NA, NA, NA, 3, NA, NA, NA…
## $ sqft        <dbl> NA, NA, NA, 735, NA, NA, NA, NA, NA, NA, NA, 2581, 1756, N…
## $ room_in_apt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ address     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ lat         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 37.5, NA, …
## $ lon         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ title       <chr> "$1250 / 2br - 2BR/2BA   1145 ALAMEDA DE LAS PULGAS", "$12…
## $ descr       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ details     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "<p class=…
# (3)
skimr::skim(rent)
Data summary
Name rent
Number of rows 200796
Number of columns 17
_______________________
Column type frequency:
character 8
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
post_id 0 1.00 9 14 0 200796 0
nhood 0 1.00 4 43 0 167 0
city 0 1.00 5 19 0 104 0
county 1394 0.99 4 13 0 10 0
address 196888 0.02 1 38 0 2869 0
title 2517 0.99 2 298 0 184961 0
descr 197542 0.02 13 16975 0 3025 0
details 192780 0.04 4 595 0 7667 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
date 0 1.00 2.01e+07 44694.07 2.00e+07 2.01e+07 2.01e+07 2.01e+07 2.02e+07 ▁▇▁▆▃
year 0 1.00 2.01e+03 4.48 2.00e+03 2.00e+03 2.01e+03 2.01e+03 2.02e+03 ▁▇▁▆▃
price 0 1.00 2.14e+03 1427.75 2.20e+02 1.30e+03 1.80e+03 2.50e+03 4.00e+04 ▇▁▁▁▁
beds 6608 0.97 1.89e+00 1.08 0.00e+00 1.00e+00 2.00e+00 3.00e+00 1.20e+01 ▇▂▁▁▁
baths 158121 0.21 1.68e+00 0.69 1.00e+00 1.00e+00 2.00e+00 2.00e+00 8.00e+00 ▇▁▁▁▁
sqft 136117 0.32 1.20e+03 5000.22 8.00e+01 7.50e+02 1.00e+03 1.36e+03 9.00e+05 ▇▁▁▁▁
room_in_apt 0 1.00 0.00e+00 0.04 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00 ▇▁▁▁▁
lat 193145 0.04 3.77e+01 0.35 3.36e+01 3.74e+01 3.78e+01 3.78e+01 4.04e+01 ▁▁▅▇▁
lon 196484 0.02 -1.22e+02 0.78 -1.23e+02 -1.22e+02 -1.22e+02 -1.22e+02 -7.42e+01 ▇▁▁▁▁

Make a plot that shows the top 20 cities in terms of % of classifieds between 2000-2018. You need to calculate the number of listings by city, and then convert that number to a %

# YOUR CODE GOES HERE
# Share of classifieds by city: count listings per city, convert the count
# to a proportion of all listings, keep the 20 largest, and plot a bar chart.
total_number_of_listings <- nrow(rent) # total number of listings in the dataset

city_rent <- rent %>%
  count(city, name = "total_city_count") %>% # listings per city (count() = group_by + summarise(n()))
  mutate(total_city_count_percent = total_city_count / total_number_of_listings) %>% # share of all listings
  slice_max(order_by = total_city_count_percent, n = 20) %>% # top 20; slice_max already sorts descending, so no arrange() needed
  ggplot(aes(x = total_city_count_percent, # x-axis: share of listings
             y = fct_reorder(city, total_city_count_percent))) + # y-axis: cities, highest share on top
  geom_col() + # bar plot
  labs(title = "San Francisco accounts for more than a quarter of all rental classifieds",
       subtitle = "% of Craigslist listings, 2000-2018",
       y = NULL,
       x = NULL,
       caption = "Source: Pennington, Kate (2018). Bay Area Craigslist Rental Housing Posts, 2000-2018") +
  scale_x_continuous(labels = scales::percent) # show the x-axis as percentages

city_rent

Make a plot that shows the evolution of median prices in San Francisco for 0, 1, 2, and 3 bedrooms listings

# YOUR CODE GOES HERE
# Median price per (beds, year) for San Francisco listings with 0-3 bedrooms.
# Note: `beds <= 3` also drops rows where beds is NA, which is intended here.
sf_rent <- rent %>%
  filter(city == "san francisco", beds <= 3) %>% # SF listings with at most 3 beds
  group_by(beds, year) %>% # one cell per beds/year combination
  summarise(median_price = median(price, na.rm = TRUE), # median price per cell (price has no NAs, na.rm is defensive)
            .groups = "drop") # return an ungrouped tibble instead of a grouped one

p <- ggplot(sf_rent, aes(x = year, y = median_price,
                         color = as.character(beds))) + # distinct color per bed count
  geom_line() + # one line per bed count
  facet_wrap(~beds, nrow = 1) + # one panel per bed count, side by side
  labs(title = "San Francisco rents have been steadily increasing",
       subtitle = "0 to 3-bed listings, 2000-2018",
       x = NULL, y = NULL,
       caption = "Source: Pennington, Kate (2018). Bay Area Craigslist Rental Housing Posts, 2000-2018") +
  theme(legend.position = "none") # facet labels already identify the groups
p

Finally, make a plot that shows median rental prices for the top 12 cities in the Bay area.

# YOUR CODE GOES HERE

# Rank cities by the total listed price of their 1-bedroom flats (a proxy
# combining listing volume and price level), keep the top 12, and plot the
# yearly median 1-bedroom price for each of those cities.
top_12 <- rent %>%
  filter(beds == 1) %>% # 1-bedroom listings only
  group_by(city) %>%
  summarise(sum_rental_price = sum(price)) %>% # total listed price per city
  slice_max(order_by = sum_rental_price, n = 12) # top 12; slice_max already sorts descending, so no arrange() needed


rent_select <- rent %>%
  semi_join(top_12, by = "city") # keep only listings from the top-12 cities (replaces base-R %in% subsetting)

one_bedroom_rent <- rent_select %>%
  filter(beds == 1) %>% # 1-bedroom listings only
  group_by(city, year) %>%
  summarise(median_rental_price = median(price), # median price per city and year
            .groups = "drop") # return an ungrouped tibble


one_bedroom_rent %>%
  ggplot(aes(x = year, y = median_rental_price,
             color = city)) + # city is already character, so each city gets its own color
  geom_line() + # line plot of median price over time
  facet_wrap(~city) + # one panel per city
  labs(title = "Rental prices for 1-bedroom flats in the Bay Area",
       x = NULL, y = NULL,
       caption = "Source: Pennington, Kate (2018). Bay Area Craigslist Rental Housing Posts, 2000-2018") +
  theme(legend.position = "none") # facet labels already identify the cities

What can you infer from these plots? Don’t just explain what’s in the graph, but speculate or tell a short story (1-2 paragraphs max).

Prices in all top 12 cities have heavily increased from 2000 until 2018. The reason for that might be the continuing success of the tech sector in the Bay Area. This leads to exorbitant wages and therefore a lot of pressure on the housing market.

However, the increase is interrupted by years in which prices decreased. One such decrease is prevalent from ~2000-2005, which corresponds with the burst of the dot-com bubble. In 2002, the Nasdaq fell 78% from its peak, which led many online shopping companies to shut down. Many of these companies were headquartered in the Bay Area, which probably led to more unemployment and therefore lower pressure on the housing market. The second dent appeared ~2009. In 2009, the financial crisis dominated the headlines and unemployment rose, decreasing housing prices. Interestingly, there is a third dent around 2017-2018. However, the Nasdaq rose during that period; therefore, we don’t think that the reason can be found in the overall economy. We assume that with housing prices more than doubling in some cities, people started moving away from the Bay Area and perhaps working remotely in other places. There might be a trend behind this, heralded by Elon Musk, who moved Tesla’s headquarters away from the Bay Area and to Texas.

2 Analysis of movies — IMDB dataset

movies <- read_csv(here::here("data", "movies.csv"))
glimpse(movies)
## Rows: 2,961
## Columns: 11
## $ title               <chr> "Avatar", "Titanic", "Jurassic World", "The Avenge…
## $ genre               <chr> "Action", "Drama", "Action", "Action", "Action", "…
## $ director            <chr> "James Cameron", "James Cameron", "Colin Trevorrow…
## $ year                <dbl> 2009, 1997, 2015, 2012, 2008, 1999, 1977, 2015, 20…
## $ duration            <dbl> 178, 194, 124, 173, 152, 136, 125, 141, 164, 93, 1…
## $ gross               <dbl> 7.61e+08, 6.59e+08, 6.52e+08, 6.23e+08, 5.33e+08, …
## $ budget              <dbl> 2.37e+08, 2.00e+08, 1.50e+08, 2.20e+08, 1.85e+08, …
## $ cast_facebook_likes <dbl> 4834, 45223, 8458, 87697, 57802, 37723, 13485, 920…
## $ votes               <dbl> 886204, 793059, 418214, 995415, 1676169, 534658, 9…
## $ reviews             <dbl> 3777, 2843, 1934, 2425, 5312, 3917, 1752, 1752, 35…
## $ rating              <dbl> 7.9, 7.7, 7.0, 8.1, 9.0, 6.5, 8.7, 7.5, 8.5, 7.2, …
skimr::skim(movies)
Data summary
Name movies
Number of rows 2961
Number of columns 11
_______________________
Column type frequency:
character 3
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 83 0 2907 0
genre 0 1 5 11 0 17 0
director 0 1 3 32 0 1366 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2.00e+03 9.95e+00 1920.0 2.00e+03 2.00e+03 2.01e+03 2.02e+03 ▁▁▁▂▇
duration 0 1 1.10e+02 2.22e+01 37.0 9.50e+01 1.06e+02 1.19e+02 3.30e+02 ▃▇▁▁▁
gross 0 1 5.81e+07 7.25e+07 703.0 1.23e+07 3.47e+07 7.56e+07 7.61e+08 ▇▁▁▁▁
budget 0 1 4.06e+07 4.37e+07 218.0 1.10e+07 2.60e+07 5.50e+07 3.00e+08 ▇▂▁▁▁
cast_facebook_likes 0 1 1.24e+04 2.05e+04 0.0 2.24e+03 4.60e+03 1.69e+04 6.57e+05 ▇▁▁▁▁
votes 0 1 1.09e+05 1.58e+05 5.0 1.99e+04 5.57e+04 1.33e+05 1.69e+06 ▇▁▁▁▁
reviews 0 1 5.03e+02 4.94e+02 2.0 1.99e+02 3.64e+02 6.31e+02 5.31e+03 ▇▁▁▁▁
rating 0 1 6.39e+00 1.05e+00 1.6 5.80e+00 6.50e+00 7.10e+00 9.30e+00 ▁▁▆▇▁

2.1 Use your data import, inspection, and cleaning skills to answer the following:

  • Are there any missing values (NAs)? Are all entries distinct or are there duplicate entries? After using the skim function on the dataset, we can conclude that there are no missing values. The distinct function shows that all entries in the dataset are distinct

  • Produce a table with the count of movies by genre, ranked in descending order

skim(movies) # no, there are no missing values
Data summary
Name movies
Number of rows 2961
Number of columns 11
_______________________
Column type frequency:
character 3
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 1 83 0 2907 0
genre 0 1 5 11 0 17 0
director 0 1 3 32 0 1366 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2.00e+03 9.95e+00 1920.0 2.00e+03 2.00e+03 2.01e+03 2.02e+03 ▁▁▁▂▇
duration 0 1 1.10e+02 2.22e+01 37.0 9.50e+01 1.06e+02 1.19e+02 3.30e+02 ▃▇▁▁▁
gross 0 1 5.81e+07 7.25e+07 703.0 1.23e+07 3.47e+07 7.56e+07 7.61e+08 ▇▁▁▁▁
budget 0 1 4.06e+07 4.37e+07 218.0 1.10e+07 2.60e+07 5.50e+07 3.00e+08 ▇▂▁▁▁
cast_facebook_likes 0 1 1.24e+04 2.05e+04 0.0 2.24e+03 4.60e+03 1.69e+04 6.57e+05 ▇▁▁▁▁
votes 0 1 1.09e+05 1.58e+05 5.0 1.99e+04 5.57e+04 1.33e+05 1.69e+06 ▇▁▁▁▁
reviews 0 1 5.03e+02 4.94e+02 2.0 1.99e+02 3.64e+02 6.31e+02 5.31e+03 ▇▁▁▁▁
rating 0 1 6.39e+00 1.05e+00 1.6 5.80e+00 6.50e+00 7.10e+00 9.30e+00 ▁▁▆▇▁
print(paste("Number of duplicate values: ", sum(duplicated(movies)))) # no, there are no duplicate entries
## [1] "Number of duplicate values:  0"
# Table with the number of movies per genre, largest genre first.
movies_count_list <- movies %>%
    count(genre, name = "movies_count", sort = TRUE) # count() = group_by(genre) + summarise(n()); sort = TRUE arranges in descending order
movies_count_list
## # A tibble: 17 × 2
##    genre       movies_count
##    <chr>              <int>
##  1 Comedy               848
##  2 Action               738
##  3 Drama                498
##  4 Adventure            288
##  5 Crime                202
##  6 Biography            135
##  7 Horror               131
##  8 Animation             35
##  9 Fantasy               28
## 10 Documentary           25
## 11 Mystery               16
## 12 Sci-Fi                 7
## 13 Family                 3
## 14 Musical                2
## 15 Romance                2
## 16 Western                2
## 17 Thriller               1
  • Produce a table with the average gross earning and budget (gross and budget) by genre. Calculate a variable return_on_budget which shows how many $ did a movie make at the box office for each $ of its budget. Rank genres by this return_on_budget in descending order
    # Average gross earnings and budget per genre, plus
    # return_on_budget = (mean gross) / (mean budget), ranked descending.
    # NOTE(review): this is a ratio of genre-level means, not the mean of
    # per-movie gross/budget ratios — confirm that is the intended metric.
    movies %>%
    group_by(genre) %>%
    summarize(
        avg_gross_earning = mean(gross),  # mean gross earnings per genre
        avg_budget = mean(budget),        # mean budget per genre
        return_on_budget = avg_gross_earning / avg_budget  # $ earned per $ of budget
    ) %>%
    arrange(desc(return_on_budget))       # highest payoff per budget dollar first
## # A tibble: 17 × 4
##    genre       avg_gross_earning avg_budget return_on_budget
##    <chr>                   <dbl>      <dbl>            <dbl>
##  1 Musical             92084000    3189500          28.9    
##  2 Family             149160478.  14833333.         10.1    
##  3 Western             20821884    3465000           6.01   
##  4 Documentary         17353973.   5887852.          2.95   
##  5 Horror              37713738.  13504916.          2.79   
##  6 Fantasy             42408841.  17582143.          2.41   
##  7 Comedy              42630552.  24446319.          1.74   
##  8 Mystery             67533021.  39218750           1.72   
##  9 Animation           98433792.  61701429.          1.60   
## 10 Biography           45201805.  28543696.          1.58   
## 11 Adventure           95794257.  66290069.          1.45   
## 12 Drama               37465371.  26242933.          1.43   
## 13 Crime               37502397.  26596169.          1.41   
## 14 Romance             31264848.  25107500           1.25   
## 15 Action              86583860.  71354888.          1.21   
## 16 Sci-Fi              29788371.  27607143.          1.08   
## 17 Thriller                2468     300000           0.00823
  • Produce a table that shows the top 15 directors who have created the highest gross revenue in the box office. Don’t just show the total gross amount, but also the mean, median, and standard deviation per director.
# Top 15 directors by total box-office gross, with mean, median, and
# standard deviation of gross per director.
movies %>%
    group_by(director) %>%
    summarize(total_gross_amount = sum(gross),            # total gross per director
        mean_gross_amount = mean(gross),                  # mean gross per director
        median_gross_amount = median(gross),              # median gross per director
        standard_deviation_gross_amount = sd(gross)) %>%  # SD per director (NA for single-film directors)
    slice_max(order_by = total_gross_amount, n = 15) # top 15; slice_max already sorts descending, so no arrange() needed
## # A tibble: 15 × 5
##    director          total_gross_amount mean_gross_amount median_gross…¹ stand…²
##    <chr>                          <dbl>             <dbl>          <dbl>   <dbl>
##  1 Steven Spielberg          4014061704        174524422.     164435221   1.01e8
##  2 Michael Bay               2231242537        171634041.     138396624   1.27e8
##  3 Tim Burton                2071275480        129454718.      76519172   1.09e8
##  4 Sam Raimi                 2014600898        201460090.     234903076   1.62e8
##  5 James Cameron             1909725910        318287652.     175562880.  3.09e8
##  6 Christopher Nolan         1813227576        226653447      196667606.  1.87e8
##  7 George Lucas              1741418480        348283696      380262555   1.46e8
##  8 Robert Zemeckis           1619309108        124562239.     100853835   9.13e7
##  9 Clint Eastwood            1378321100         72543216.      46700000   7.55e7
## 10 Francis Lawrence          1358501971        271700394.     281666058   1.35e8
## 11 Ron Howard                1335988092        111332341      101587923   8.19e7
## 12 Gore Verbinski            1329600995        189942999.     123207194   1.54e8
## 13 Andrew Adamson            1137446920        284361730      279680930.  1.21e8
## 14 Shawn Levy                1129750988        102704635.      85463309   6.55e7
## 15 Ridley Scott              1128857598         80632686.      47775715   6.88e7
## # … with abbreviated variable names ¹​median_gross_amount,
## #   ²​standard_deviation_gross_amount
  • Finally, ratings. Produce a table that describes how ratings are distributed by genre. We don’t want just the mean, but also, min, max, median, SD and some kind of a histogram or density graph that visually shows how ratings are distributed.
# Summary statistics of IMDB ratings by genre: mean, min, max, median, SD,
# ranked by mean rating.
movies_ratings_by_genre <- movies %>%
    group_by(genre) %>%
    summarize(
        mean_rating   = mean(rating),
        min_rating    = min(rating),
        max_rating    = max(rating),
        median_rating = median(rating),
        sd_rating     = sd(rating)  # NA for genres with a single movie (e.g. Thriller)
    ) %>%
    arrange(desc(mean_rating))      # best-rated genres first

movies_ratings_by_genre
## # A tibble: 17 × 6
##    genre       mean_rating min_rating max_rating median_rating sd_rating
##    <chr>             <dbl>      <dbl>      <dbl>         <dbl>     <dbl>
##  1 Biography          7.11        4.5        8.9          7.2      0.760
##  2 Crime              6.92        4.8        9.3          6.9      0.849
##  3 Mystery            6.86        4.6        8.5          6.9      0.882
##  4 Musical            6.75        6.3        7.2          6.75     0.636
##  5 Drama              6.73        2.1        8.8          6.8      0.917
##  6 Documentary        6.66        1.6        8.5          7.4      1.77 
##  7 Sci-Fi             6.66        5          8.2          6.4      1.09 
##  8 Animation          6.65        4.5        8            6.9      0.968
##  9 Romance            6.65        6.2        7.1          6.65     0.636
## 10 Adventure          6.51        2.3        8.6          6.6      1.09 
## 11 Family             6.5         5.7        7.9          5.9      1.22 
## 12 Action             6.23        2.1        9            6.3      1.03 
## 13 Fantasy            6.15        4.3        7.9          6.45     0.959
## 14 Comedy             6.11        1.9        8.8          6.2      1.02 
## 15 Horror             5.83        3.6        8.5          5.9      1.01 
## 16 Western            5.7         4.1        7.3          5.7      2.26 
## 17 Thriller           4.8         4.8        4.8          4.8     NA
ggplot(movies, aes(x = rating)) +      # rating on the x-axis of every panel
    geom_density() +                    # density curve (not a histogram) of the rating distribution
    facet_wrap(~genre) +                # one panel per genre to compare distributions across genres
    labs(title = "Distribution of ratings by genre", 
         x = "Rating", y = NULL)

2.2 Use ggplot to answer the following

  • Examine the relationship between gross and cast_facebook_likes. Produce a scatterplot and write one sentence discussing whether the number of facebook likes that the cast has received is likely to be a good predictor of how much money a movie will make at the box office. What variable are you going to map to the Y- and X- axes?
# create a plot with the cast's facebook likes on the x-axis and the gross amount on the y-axis. The reason for the variable assignment to the axes is that we want to examine how the facebook likes affect the gross, not the other way around. The affected variable usually goes on the y-axis 
# Scatterplot of gross vs. cast Facebook likes. Likes map to the x-axis
# (the candidate predictor), gross to the y-axis (the outcome). Both axes
# use a log10 scale so the heavily skewed values spread out.
ggplot(movies, aes(x = cast_facebook_likes, y = gross)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +  # linear trend line (fit on the log-log scale)
  labs(
    title = "Relationship between gross amount and cast's facebook likes",
    x = "Cast facebook likes",
    y = "Gross amount"
  )

As observed from the scatterplot, the general consensus seems to be that the number of likes is a good measure of how well a movie will do at the box office. This could be because many people check the number of likes that a movie has received on Facebook and then decide whether to watch the movie or not. The trend shows that most of the movies that have received a higher number of likes have also done well at the box office. We have plotted cast_facebook_likes on the x-axis and the gross earnings on the y-axis.

  • Examine the relationship between gross and budget. Produce a scatterplot and write one sentence discussing whether budget is likely to be a good predictor of how much money a movie will make at the box office.
#using movie dataset to find the correlation between budget and gross earnings of movies. geom_point is used to create a scatterplot and the axis have been scaled to visualise the results better
# Scatterplot of gross earnings against budget, both on a log10 scale so
# the skewed dollar amounts are easier to read, with a linear trend line.
ggplot(movies, aes(x = budget, y = gross)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm", se = FALSE) +  # linear trend line (fit on the log-log scale)
  labs(
    title = "Relationship between a movies' gross amount and budget",
    x = "Budget", y = "Gross amount"
  )

As observed from the scatterplot, movies with higher budget are more likely to do well at the box office apart from some outliers.

-Examine the relationship between gross and rating. Produce a scatterplot, faceted by genre and discuss whether IMDB ratings are likely to be a good predictor of how much money a movie will make at the box office. Is there anything strange in this dataset?

For documentary and sci-fi movies, we have a negative correlation between the gross amount and the rating. Therefore, this dataset suggests that the more a documentary or sci-fi movie earns at the box office, the worse its rating tends to be. However, one has to point out that our samples for both documentary and sci-fi movies are small. Therefore, we can’t be certain that the sample correlation is also prevalent in the entire population.

#This chunk of code is used to plot the correlation between gross and rating across various genres
# Scatterplot of gross vs. rating, faceted by genre, with a per-genre
# linear trend line so the strength/direction of the relationship can be
# compared across genres.
ggplot(movies, aes(y = gross, x = rating)) + 
    geom_point() + 
    facet_wrap(~genre) + 
    geom_smooth(method = "lm", se = FALSE) + # add line of best fit according to a linear regression model
    labs(title = "Relationship between a movie's gross amount and rating", # fixed typos in title: "anbd" -> "and", "movies'" -> "movie's"
         y = "Gross amount", x = "Rating")

3 Returns of financial stocks

nyse <- read_csv(here::here("data","nyse.csv"))

Based on this dataset, create a table and a bar plot that shows the number of companies per sector, in descending order

# YOUR CODE GOES HERE
# Table with the number of companies per sector, in descending order.
com_sector_nyse <- nyse %>% 
  group_by(sector) %>%  # group by sector
  summarise(num_com_per_sec = n()) %>%  # n() counts rows per group; dplyr::count() expects a data frame, not a column, so count(sector) here relied on plyr being attached
  arrange(desc(num_com_per_sec)) # arrange in descending order

com_sector_nyse
## # A tibble: 12 × 2
##    sector                num_com_per_sec
##    <chr>                           <int>
##  1 Finance                            97
##  2 Consumer Services                  79
##  3 Public Utilities                   60
##  4 Capital Goods                      45
##  5 Health Care                        45
##  6 Energy                             42
##  7 Technology                         40
##  8 Basic Industries                   39
##  9 Consumer Non-Durables              31
## 10 Miscellaneous                      12
## 11 Transportation                     10
## 12 Consumer Durables                   8
# Bar chart of company counts per sector, largest sector on the left.
com_sector_nyse %>%
  ggplot(aes(x = fct_reorder(sector, -num_com_per_sec),  # sectors ordered by count, descending
             y = num_com_per_sec)) +
  geom_col() +
  labs(
    title = "Number of companies per sector",
    x = "Sector name",
    y = "Numbers of companies"
  )

Next, let’s choose some stocks and their ticker symbols and download some data. You MUST choose 6 different stocks from the ones listed below; You should, however, add SPY which is the SP500 ETF (Exchange Traded Fund).

# Notice the cache=TRUE argument inthe chunk options. Because getting data is time consuming, 
# cache=TRUE means that once it downloads data, the chunk will not run again next time you knit your Rmd

# Download daily stock prices for six chosen tickers plus SPY (the S&P 500
# ETF benchmark) via tidyquant, then group by ticker for later per-symbol
# return calculations.
myStocks <- c("AAPL","JPM","DIS","DPZ","ANF","TSLA","SPY" ) %>% #select the six stocks plus the SPY benchmark
  tq_get(get  = "stock.prices",
         from = "2011-01-01",
         to   = "2022-08-31") %>% # daily OHLCV + adjusted prices, Jan 2011 through Aug 2022
  group_by(symbol) # grouped so tq_transmute later works per stock

glimpse(myStocks) # examine the structure of the resulting data frame
## Rows: 20,545
## Columns: 8
## Groups: symbol [7]
## $ symbol   <chr> "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL", "AAPL…
## $ date     <date> 2011-01-03, 2011-01-04, 2011-01-05, 2011-01-06, 2011-01-07, …
## $ open     <dbl> 11.6, 11.9, 11.8, 12.0, 11.9, 12.1, 12.3, 12.3, 12.3, 12.4, 1…
## $ high     <dbl> 11.8, 11.9, 11.9, 12.0, 12.0, 12.3, 12.3, 12.3, 12.4, 12.4, 1…
## $ low      <dbl> 11.6, 11.7, 11.8, 11.9, 11.9, 12.0, 12.1, 12.2, 12.3, 12.3, 1…
## $ close    <dbl> 11.8, 11.8, 11.9, 11.9, 12.0, 12.2, 12.2, 12.3, 12.3, 12.4, 1…
## $ volume   <dbl> 4.45e+08, 3.09e+08, 2.56e+08, 3.00e+08, 3.12e+08, 4.49e+08, 4…
## $ adjusted <dbl> 10.05, 10.10, 10.18, 10.18, 10.25, 10.44, 10.42, 10.50, 10.54…

Financial performance analysis depends on returns; if I buy a stock today for $100 and sell it tomorrow for $101.75, my one-day return, assuming no transaction costs, is 1.75%. So given the adjusted closing prices, our first step is to calculate daily and monthly returns.

# Calculate daily log returns from the adjusted closing prices.
myStocks_returns_daily <- myStocks %>%
  tq_transmute(select     = adjusted, 
               mutate_fun = periodReturn, 
               period     = "daily", 
               type       = "log",
               col_rename = "daily_returns") # removed `cols = c(nested.col)`: `nested.col` is undefined and `cols` is a deprecated tq_transmute argument

# Calculate monthly arithmetic returns from the adjusted closing prices.
myStocks_returns_monthly <- myStocks %>%
  tq_transmute(select     = adjusted, 
               mutate_fun = periodReturn, 
               period     = "monthly", 
               type       = "arithmetic",
               col_rename = "monthly_returns") # removed `cols = c(nested.col)`: `nested.col` is undefined and `cols` is a deprecated tq_transmute argument

# Calculate yearly arithmetic returns from the adjusted closing prices.
# FIX 1: dropped `cols = c(nested.col)` — leftover template argument
#        referencing an undefined object.
# FIX 2: dropped the redundant group_by(symbol) — myStocks is already
#        grouped by symbol when it is created.
myStocks_returns_annual <- myStocks %>%
  tq_transmute(select     = adjusted,
               mutate_fun = periodReturn,
               period     = "yearly",
               type       = "arithmetic",
               col_rename = "yearly_returns")

Create a table where you summarise monthly returns for each of the stocks and SPY; min, max, median, mean, SD.

# Summary statistics of monthly returns for each ticker (stocks and SPY):
# minimum, maximum, median, mean, and standard deviation.
Monthly_return <- myStocks_returns_monthly %>%
  group_by(symbol) %>%
  summarise(
    min    = min(monthly_returns),
    max    = max(monthly_returns),
    median = median(monthly_returns),
    mean   = mean(monthly_returns),
    SD     = sd(monthly_returns)
  )

# Display the summary table
Monthly_return
## # A tibble: 7 × 6
##   symbol    min   max  median    mean     SD
##   <chr>   <dbl> <dbl>   <dbl>   <dbl>  <dbl>
## 1 AAPL   -0.181 0.217 0.0230  0.0230  0.0791
## 2 ANF    -0.421 0.507 0.00105 0.00337 0.146 
## 3 DIS    -0.186 0.234 0.00725 0.0113  0.0721
## 4 DPZ    -0.194 0.342 0.0246  0.0270  0.0774
## 5 JPM    -0.229 0.202 0.0199  0.0119  0.0727
## 6 SPY    -0.125 0.127 0.0146  0.0106  0.0404
## 7 TSLA   -0.224 0.811 0.0117  0.0501  0.177

Plot a density plot, using geom_density(), for each of the stocks

# A single plot with one density curve per ticker makes it easy to compare
# the return distributions of all stocks at once.
density_plot <- ggplot(
  myStocks_returns_monthly,
  aes(x = monthly_returns, color = symbol)
) +
  geom_density() +
  labs(
    title = "Density plot of monthly returns for selected stocks",
    x = "Monthly returns",
    y = "Density"
  )
density_plot

What can you infer from this plot? Which stock is the riskiest? The least risky?

Infer from this plot

  1. Except for AAPL, the other stocks' monthly returns are roughly unimodal and bell-shaped, but skewed to the right rather than perfectly Normal.
  2. Most stocks have positive monthly returns for their peaks. Apple has twin peaks with one that is slightly negative and one that is positive. TSLA has a negative monthly return for its peak, however, it is heavily skewed right.
  3. Comparing the S&P 500 ETF (SPY) to the individual stocks, we clearly see that SPY is, as expected, the least risky, with most returns concentrated around a strong peak.

The riskiest stock

  1. Tesla appears to have the largest standard deviation because it spanned very widely. This is partly due to Tesla having had very many good months with very positive returns.
  2. However, comparing Abercrombie & Fitch with Tesla, Abercrombie had more months with very negative returns. Therefore, both stocks can be deemed as being quite risky. If you base your decision which is the riskiest stock on the standard deviation, then Tesla is the riskiest stock.

The least risky stock: among the individual stocks, DIS has the most concentrated distribution, so it might be the first choice for conservative investors.

Finally, make a plot that shows the expected monthly return (mean) of a stock on the Y axis and the risk (standard deviation) in the X-axis. Please use ggrepel::geom_text_repel() to label each stock

# Risk-return scatter: expected monthly return (mean) against risk (SD),
# with non-overlapping ticker labels via ggrepel.
risk_return_plot <- ggplot(Monthly_return, aes(x = SD, y = mean)) +
  geom_point() +
  geom_text_repel(aes(label = symbol), nudge_x = 1, na.rm = TRUE) +
  labs(
    title = "Examining the relationship between a stock's standard deviation and its monthly return",
    x = "Standard deviation",
    y = "Expected monthly return"
  )
risk_return_plot

What can you infer from this plot? Are there any stocks which, while being riskier, do not have a higher expected return?

TYPE YOUR ANSWER AFTER (AND OUTSIDE!) THIS BLOCKQUOTE.

3.1 infer

Generally speaking, higher risk brings higher return. This is because investors want to be reimbursed for the risk they are taking.

3.2 an exception

However, Abercrombie & Fitch has a high standard deviation but low expected return. One can infer that Abercrombie & Fitch as a company wasn’t very successful during this time period.

4 On your own: Spotify

spotify_songs <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv')

Produce a one-page summary describing this dataset. Here is a non-exhaustive list of questions:

  1. What is the distribution of songs’ popularity (track_popularity). Does it look like a Normal distribution?
# Songs appear on multiple playlists, so deduplicate to one row per track_id
# before looking at distributions.
spotify_songs_unique <- spotify_songs %>% distinct(track_id, .keep_all = TRUE)

popularity_histogram <- ggplot(spotify_songs_unique, aes(x = track_popularity)) +
  geom_histogram() +
  labs(
    title = "Distribution of track_popularity",
    x = "track_popularity",
    y = "frequency"
  )
popularity_histogram

Even though it’s not a perfect bell curve, the distribution of track_popularity closely resembles a Normal distribution curve. However, the number of songs with zero popularity is high in number, causing a high spike in the curve.

  1. There are 12 audio features for each track, including confidence measures like acousticness, liveness, speechinessand instrumentalness, perceptual measures like energy, loudness, danceability and valence (positiveness), and descriptors like duration, tempo, key, and mode. How are they distributed? can you roughly guess which of these variables is closer to Normal just by looking at summary statistics?
# Base-R summary statistics for every column of the raw Spotify data.
spotify_songs_summary <- summary(spotify_songs)
# Render as a striped, scrollable HTML table.
# FIX: use an explicit kableExtra:: prefix on kable_styling() for
# consistency — the original mixed a bare call with kableExtra::scroll_box().
knitr::kable(spotify_songs_summary, "html") %>%
  kableExtra::kable_styling("striped") %>%
  kableExtra::scroll_box(width = "100%", height = "100%")
track_id track_name track_artist track_popularity track_album_id track_album_name track_album_release_date playlist_name playlist_id playlist_genre playlist_subgenre danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo duration_ms
Length:32833 Length:32833 Length:32833 Min. : 0.0 Length:32833 Length:32833 Length:32833 Length:32833 Length:32833 Length:32833 Length:32833 Min. :0.000 Min. :0.000 Min. : 0.00 Min. :-46.4 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.000 Min. : 0 Min. : 4000
Class :character Class :character Class :character 1st Qu.: 24.0 Class :character Class :character Class :character Class :character Class :character Class :character Class :character 1st Qu.:0.563 1st Qu.:0.581 1st Qu.: 2.00 1st Qu.: -8.2 1st Qu.:0.000 1st Qu.:0.041 1st Qu.:0.015 1st Qu.:0.000 1st Qu.:0.093 1st Qu.:0.331 1st Qu.:100 1st Qu.:187819
Mode :character Mode :character Mode :character Median : 45.0 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Median :0.672 Median :0.721 Median : 6.00 Median : -6.2 Median :1.000 Median :0.062 Median :0.080 Median :0.000 Median :0.127 Median :0.512 Median :122 Median :216000
NA NA NA Mean : 42.5 NA NA NA NA NA NA NA Mean :0.655 Mean :0.699 Mean : 5.37 Mean : -6.7 Mean :0.566 Mean :0.107 Mean :0.175 Mean :0.085 Mean :0.190 Mean :0.511 Mean :121 Mean :225800
NA NA NA 3rd Qu.: 62.0 NA NA NA NA NA NA NA 3rd Qu.:0.761 3rd Qu.:0.840 3rd Qu.: 9.00 3rd Qu.: -4.6 3rd Qu.:1.000 3rd Qu.:0.132 3rd Qu.:0.255 3rd Qu.:0.005 3rd Qu.:0.248 3rd Qu.:0.693 3rd Qu.:134 3rd Qu.:253585
NA NA NA Max. :100.0 NA NA NA NA NA NA NA Max. :0.983 Max. :1.000 Max. :11.00 Max. : 1.3 Max. :1.000 Max. :0.918 Max. :0.994 Max. :0.994 Max. :0.996 Max. :0.991 Max. :239 Max. :517810
# Deduplicate: one row per track_id.
spotify_songs_unique <- spotify_songs %>% distinct(track_id, .keep_all = TRUE)

# Helper: histogram of one audio feature.
# The original repeated the same ggplot call ten times with only the column
# name changed; a single parameterised helper keeps titles/labels consistent
# and makes adding or dropping a feature trivial.
plot_feature_distribution <- function(data, feature) {
  ggplot(data, aes(x = .data[[feature]])) +
    geom_histogram() +
    labs(title = paste("Distribution of", feature),
         x = feature,
         y = "frequency")
}

# The ten audio features examined, in the original order.
audio_features <- c("acousticness", "liveness", "speechiness",
                    "instrumentalness", "energy", "loudness",
                    "danceability", "valence", "duration_ms", "tempo")

# ggplot objects are not auto-printed inside a loop, so print() explicitly.
for (feature in audio_features) {
  print(plot_feature_distribution(spotify_songs_unique, feature))
}

A distribution is normal when it is distributed along a bell curve and the mode, mean and median are the same. In our opinion it is difficult to say just from the summary statistics whether a distribution is normal because we don’t have entire context of data. Even if a variable might have the same mean and median it doesn’t mean that it will necessarily be distributed normally.

In order to see whether a variable is normally distributed, we created histograms for the 12 variables. From these histograms we can see that some variables seem to be more normally distributed which goes hand in hand with the summary stats showing that when the mean and the median have lower difference, the curve has higher chance of being a normal curve.

  1. Is there any relationship between valence and track_popularity? danceability and track_popularity ?
# Deduplicate: songs appear on multiple playlists, so keep one row per track
# to avoid double-counting.
spotify_songs_unique <- spotify_songs %>% distinct(track_id, .keep_all = TRUE)

# Scatter plot (with smoothed trend line) of valence vs track popularity.
track_popularity_and_valence <- ggplot(
  spotify_songs_unique,
  aes(x = valence, y = track_popularity)
) +
  geom_point() +
  geom_smooth(method = "auto", se = TRUE, fullrange = FALSE, level = 0.95) +
  labs(
    title = "Scatter plot of valence and track_popularity",
    x = "valence",
    y = "track_popularity"
  )
track_popularity_and_valence

# Scatter plot (with smoothed trend line) of danceability vs track popularity.
track_popularity_and_danceability <- ggplot(
  spotify_songs_unique,
  aes(x = danceability, y = track_popularity)
) +
  geom_point() +
  geom_smooth(method = "auto", se = TRUE, fullrange = FALSE, level = 0.95) +
  labs(
    title = "Scatter plot of danceability and track_popularity",
    x = "danceability",
    y = "track_popularity"
  )
track_popularity_and_danceability

So, according to our scatter plots we can see that there doesn’t seem to be a correlation between valence and track popularity or danceability and track popularity.

Valence and track_popularity have a straight line. Hence, no matter what the valence value is, the track popularity stays the same.

This is also the case for danceability and track popularity. Even though there might be a small increase in popularity when the track is more danceable, it is not statistically significant.

  1. mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0. Do songs written on a major scale have higher danceability compared to those in minor scale? What about track_popularity?
# Deduplicate songs that appear on several playlists.
spotify_songs_unique <- spotify_songs %>% distinct(track_id, .keep_all = TRUE)

# IMPROVEMENT: the original facet strips showed the raw codes 0/1; label
# them explicitly (mode: 0 = minor, 1 = major, per the question text).
mode_labels <- as_labeller(c(`0` = "Minor (0)", `1` = "Major (1)"))

# Distribution of danceability, faceted by musical mode.
danceability <- ggplot(spotify_songs_unique, aes(x = danceability)) +
  geom_histogram() +
  facet_grid(~mode, labeller = mode_labels) +
  labs(title = "Distribution of danceability in Major and Minor scale",
       x = "danceability",
       y = "frequency")
danceability

# Distribution of track_popularity, faceted by musical mode.
track_popularity <- ggplot(spotify_songs_unique, aes(x = track_popularity)) +
  geom_histogram() +
  facet_grid(~mode, labeller = mode_labels) +
  labs(title = "Distribution of track_popularity in Major and Minor scale",
       x = "track_popularity",
       y = "frequency")
track_popularity

There are slightly more songs on major scale that are danceable. Even though the danceability plot looks very similar whether it is in major or minor, there is a greater number of songs in major, hence, major songs have slightly higher danceability.

Looking at the graphs for track popularity we can see that it is more or less the same. The distributions are very similar but if the scale is major there is a higher count of popular songs.

However, since the curves are extremely similar in both major or minor, having a different scale doesn’t really add more value to danceability or track popularity.

5 Challenge 1: Replicating a chart

The purpose of this exercise is to reproduce a plot using your dplyr and ggplot2 skills. It builds on exercise 1, the San Francisco rentals data.

You have to create a graph that calculates the cumulative % change for 0-, 1-, and 2-bed flats between 2000 and 2018 for the top twelve cities in the Bay Area, by the number of ads that appeared in Craigslist. Your final graph should look like this

# YOUR CODE GOES HERE
# NOTE(review): total_count is never used in this chunk; kept in case a later
# chunk depends on it. count(rent) is a 1x1 tibble, so as.numeric() extracts
# the scalar row count.
total_count <- as.numeric(count(rent))

# Top 12 cities by number of Craigslist ads.
# slice_max(with_ties = FALSE) replaces arrange() + head(12) and guarantees
# exactly 12 rows even if there are ties at the cutoff.
top_12_cities <- rent %>%
  group_by(city) %>%
  summarise(total_count_city = n()) %>%
  slice_max(total_count_city, n = 12, with_ties = FALSE)

# Median rental price per city / bedroom count / year, 0-2 beds only.
city_rent <- rent %>%
  filter(beds < 3, city %in% top_12_cities$city) %>%
  group_by(city, beds, year) %>%
  summarise(median_rental = median(price), .groups = "drop_last")

# Cumulative % change: chain year-over-year price ratios with cumprod().
# The first year of each (city, beds) series has no lag, so its ratio is
# set to 1 — the baseline of the cumulative index.
final_solution <- city_rent %>%
  group_by(city, beds) %>%
  mutate(pct_change = median_rental / lag(median_rental),
         pct_change = ifelse(is.na(pct_change), 1, pct_change),
         cumulative_change = cumprod(pct_change))

# Line plot of the cumulative index, faceted by beds (rows) x city (cols).
ggplot(final_solution, aes(x = year, y = cumulative_change, color = city)) +
  geom_line() +
  facet_grid(beds ~ city) +
  labs(title = "Cumulative % change in 0,1, and 2-bed rentals in Bay Area",
       subtitle = "2000-2018",
       x = NULL,
       y = NULL) +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))

6 Challenge 2: 2016 California Contributors plots

# vroom() is significantly faster than read.csv() for files of this size.
CA_contributors_2016 <- vroom::vroom(here::here("data", "CA_contributors_2016.csv"))

zip_code_database <- vroom::vroom(here::here("data", "zip_code_database.csv"))

# The zip columns must share a type (character) before the join will work.
CA_contributors_2016 <- CA_contributors_2016 %>%
  mutate(zip = as.character(zip))
zip_code_database_new <- zip_code_database %>%
  mutate(zip = as.character(zip))

# Attach city information (primary_city, etc.) to each contribution via zip.
CA_contributors_2016 <- left_join(CA_contributors_2016, zip_code_database_new, by = "zip")

# Top-10 cities by total amount donated to Hillary Clinton.
hillary <- CA_contributors_2016 %>%
  filter(cand_nm == "Clinton, Hillary Rodham") %>%
  group_by(primary_city) %>%
  summarise(total_amt = sum(contb_receipt_amt)) %>%          # total raised per city
  slice_max(order_by = total_amt, n = 10) %>%                # keep the 10 largest
  mutate(primary_city = fct_reorder(primary_city, total_amt)) %>%  # sort bars by amount
  ggplot(aes(x = total_amt, y = primary_city)) +
  geom_col(fill = "blue") +
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(title = "Where did candidates raise most money?",
       subtitle = "Clinton, Hillary Rodham")

# Top-10 cities by total amount donated to Donald Trump.
# CONSISTENCY FIX: written in the same style as the Clinton plot — the
# original used `ggplot(mapping = aes(...))` here but `ggplot(aes(...))`
# there; the two sibling pipelines should read identically.
trump <- CA_contributors_2016 %>%
  filter(cand_nm == "Trump, Donald J.") %>%
  group_by(primary_city) %>%
  summarise(total_amt = sum(contb_receipt_amt)) %>%          # total raised per city
  slice_max(order_by = total_amt, n = 10) %>%                # keep the 10 largest
  mutate(primary_city = fct_reorder(primary_city, total_amt)) %>%  # sort bars by amount
  ggplot(aes(x = total_amt, y = primary_city)) +
  geom_col(fill = "red") +
  scale_x_continuous(labels = scales::dollar_format()) +
  labs(subtitle = "Trump, Donald J.")

# Join the two plots side by side with patchwork's `+` operator; the
# theme_bw()/labs() calls attach to the preceding ggplot object before
# the plots are combined, so each panel gets its own theme and axis labels.
hillary + theme_bw(base_size = 14) + labs(x = "Amount Raised", y = "Primary City") +
trump + theme_bw(base_size = 14) + labs(x = "Amount Raised", y = "Primary City")

# Build a faceted plot for the top 10 candidates; the same approach works
# for any number of candidates, including just two.

# Candidates ranked by total amount raised; keep the top 10.
top_ten <- CA_contributors_2016 %>%
  group_by(cand_nm) %>%
  summarise(total_amt = sum(contb_receipt_amt)) %>%
  slice_max(order_by = total_amt, n = 10)

# Restrict contributions to the top-10 candidates, then total per
# (candidate, city).
# BUG FIX: the original used `cand_nm == top_ten$cand_nm`, which recycles
# the 10-element vector element-wise against the column — it keeps only
# rows whose position happens to line up with the right candidate name.
# `%in%` is the correct set-membership test.
CA_2016 <- CA_contributors_2016 %>%
  filter(cand_nm %in% top_ten$cand_nm) %>%
  group_by(cand_nm, primary_city) %>%
  summarise(total = sum(contb_receipt_amt))

# Faceted bar chart: top-10 cities per candidate.
# FIX 1: `top_n(10)` is superseded dplyr (not tidytext, as the original
#         comment claimed) — replaced with slice_max().
# FIX 2: axis labels were swapped — the x aesthetic is the city and the
#         y aesthetic is the amount (coord_flip only changes how they are
#         displayed, not which label belongs to which aesthetic).
CA_2016 %>%
  group_by(cand_nm) %>%
  slice_max(order_by = total, n = 10) %>%   # top 10 cities within each candidate
  ungroup() %>%
  mutate(primary_city = reorder_within(primary_city, total, cand_nm)) %>%  # tidytext: order cities within each facet
  ggplot(aes(primary_city, total, fill = cand_nm)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~cand_nm, scales = "free_y") + # one panel per candidate
  coord_flip() +
  scale_x_reordered() +                     # strip reorder_within()'s suffix from labels
  scale_y_continuous(expand = c(0, 0)) +    # bars start exactly at 0
  labs(title = "Top 10 candidates donations in the Bay Area",
       x = "Cities",
       y = "Total amount donated")

7 Deliverables

There is a lot of explanatory text, comments, etc. You do not need these, so delete them and produce a stand-alone document that you could share with someone. Knit the edited and completed R Markdown file as an HTML document (use the “Knit” button at the top of the script editor window) and upload it to Canvas.

8 Details

  • Who did you collaborate with: Study group 4
  • Approximately how much time did you spend on this problem set: 7h per person
  • What, if anything, gave you the most trouble: Understanding the tidytext package